library(gapminder)
library(plotly)
## Warning: package 'plotly' was built under R version 3.6.2
## Loading required package: ggplot2
## Warning: package 'ggplot2' was built under R version 3.6.2
##
## Attaching package: 'plotly'
## The following object is masked from 'package:ggplot2':
##
## last_plot
## The following object is masked from 'package:stats':
##
## filter
## The following object is masked from 'package:graphics':
##
## layout
library(dplyr)
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
library(tidyverse)
## ── Attaching packages ─────────────────────────────────────── tidyverse 1.3.0 ──
## ✔ tibble 3.0.3 ✔ purrr 0.3.3
## ✔ tidyr 1.0.0 ✔ stringr 1.4.0
## ✔ readr 1.3.1 ✔ forcats 0.4.0
## Warning: package 'tibble' was built under R version 3.6.2
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks plotly::filter(), stats::filter()
## ✖ dplyr::lag() masks stats::lag()
library(tmaptools)
## Warning: package 'tmaptools' was built under R version 3.6.2
library(tmap)
## Warning: package 'tmap' was built under R version 3.6.2
library(sf)
## Warning: package 'sf' was built under R version 3.6.2
## Linking to GEOS 3.7.2, GDAL 2.4.2, PROJ 5.2.0
# Load the Billboard summer hits data straight from the public GitHub copy.
# Column types are guessed by readr.
top_hits <- read_csv(
  file = "https://raw.githubusercontent.com/reisanar/datasets/master/all_billboard_summer_hits.csv"
)
## Parsed with column specification:
## cols(
## .default = col_double(),
## key = col_character(),
## mode = col_character(),
## track_uri = col_character(),
## key_mode = col_character(),
## playlist_name = col_character(),
## playlist_img = col_character(),
## track_name = col_character(),
## artist_name = col_character(),
## album_name = col_character(),
## album_img = col_character()
## )
## See spec(...) for full column specifications.
# Hand-built lookup of artist -> birth state, stored as an Excel workbook.
# The path defaults to the original author's location but can be overridden
# with the ARTISTS_STATES_XLSX environment variable, so the script also runs
# on machines where that absolute path does not exist.
artists_states_path <- Sys.getenv(
  "ARTISTS_STATES_XLSX",
  unset = "/Users/katiedills/Desktop/Mini_Project2/data/artists_states.xlsx"
)
artists_states <- readxl::read_xlsx(artists_states_path)
# Show the last rows of the hits data.
tail(top_hits)
# Number of rows (tracks) per artist, largest count first.
# count() tallies the rows in each artist_name group, `name` labels the tally
# column, and `sort = TRUE` orders it descending. This replaces the
# group_by() + summarise(sum(n())) + arrange(desc()) chain, in which sum()
# around the already-scalar n() was redundant.
artist_count <- top_hits %>%
  count(artist_name, name = "track_count", sort = TRUE)
print(artist_count)
## # A tibble: 468 x 2
## artist_name track_count
## <chr> <int>
## 1 Rihanna 7
## 2 Elton John 6
## 3 Katy Perry 6
## 4 Mariah Carey 5
## 5 The Rolling Stones 5
## 6 Usher 5
## 7 Donna Summer 4
## 8 The Beatles 4
## 9 Wings 4
## 10 Bee Gees 3
## # … with 458 more rows
# Preview the first rows of the birth-state lookup, then report the class of
# the per-artist count table.
artists_states %>% head()
artist_count %>% class()
## [1] "tbl_df" "tbl" "data.frame"
# Attach each looked-up artist's birth state to their track count.
# right_join() keeps every row of artists_states, so only artists present in
# the lookup appear in the result. The key is named explicitly so the join
# cannot silently change if the two tables ever gain another shared column.
artist_location <- right_join(
  artist_count,
  artists_states,
  by = "artist_name"
)
print(artist_location)
## # A tibble: 29 x 3
## artist_name track_count state
## <chr> <int> <chr>
## 1 Katy Perry 6 California
## 2 Mariah Carey 5 New York
## 3 Usher 5 Texas
## 4 Donna Summer 4 Massachusetts
## 5 Elvis Presley 3 Mississippi
## 6 Eminem 3 Missouri
## 7 Janet Jackson 3 Indiana
## 8 Madonna 3 Michigan
## 9 Prince 3 Minnesota
## 10 Stevie Wonder 3 Michigan
## # … with 19 more rows
After reviewing the top few artists by number of top hits, I decided to compare danceability and energy by artist. I chose Katy Perry, Mariah Carey, and Rihanna because I know their popular songs and expected them to be mostly upbeat. However, after plotting the three, I was surprised to see that Mariah Carey’s songs have much lower energy levels than Katy Perry’s and Rihanna’s, although Rihanna also had a few top hits that were not as high-energy as Katy Perry’s.
# Keep only the tracks by the three artists being compared, then preview
# the first rows of the subset.
top_females <- top_hits %>%
  filter(artist_name %in% c("Rihanna", "Katy Perry", "Mariah Carey"))
top_females %>% head()
# Scatter plot of energy against danceability (log10 x-axis), one colour per
# artist, rendered as an interactive plotly widget.
my_plot <- top_females %>%
  ggplot(aes(x = danceability, y = energy, color = artist_name)) +
  geom_point() +
  scale_x_log10() +
  theme_minimal()
ggplotly(my_plot)
I then wanted to analyze energy vs. loudness across the top hits. As one would expect, energy increases as loudness increases. It is also interesting that, across all of the top hits, there is a fairly good balance between energy and loudness, with not too many outliers skewed strongly toward either attribute.
# Energy against loudness for every track, with two trend lines overlaid:
# a straight-line (lm) fit and geom_smooth()'s default smoother.
top_hits %>%
  ggplot(mapping = aes(x = loudness, y = energy)) +
  geom_point() +
  geom_smooth(formula = "y ~ x", method = "lm") +
  geom_smooth() +
  theme_minimal()
## `geom_smooth()` using method = 'loess' and formula 'y ~ x'
***
My dataset did not have any location information, so in order to create a spatial visualization I took a random selection of artist names and created an additional file that included their birthplaces. I then used the shapefile to map the relationship between where an artist was born and how many top hits they have. It is very clear that California is the most common place for top-hit artists to be born; this may have to do with access to the music industry itself.
# Download the US state boundary shapefile and join it to the artist data so
# track counts can be mapped by birth state.
library(sf)
temp_shapefile <- tempfile(fileext = ".zip")
# mode = "wb": the archive is binary, and a text-mode transfer corrupts it on
# Windows.
download.file(
  "https://www2.census.gov/geo/tiger/GENZ2018/shp/cb_2018_us_state_500k.zip",
  temp_shapefile,
  mode = "wb"
)
# Extract into the session's temporary directory instead of the working
# directory, so running the script leaves no shapefile parts behind.
shapefile_dir <- file.path(tempdir(), "cb_2018_us_state_500k")
unzip(temp_shapefile, exdir = shapefile_dir)
unlink(temp_shapefile)
sf_states <- read_sf(file.path(shapefile_dir, "cb_2018_us_state_500k.shp"))
# Rename the state-name column so it matches the join key in artist_location.
names(sf_states)[names(sf_states) == "NAME"] <- "state"
head(sf_states)
# Collapse to one row per state before joining. Merging the per-artist table
# directly repeats a state's polygon once per artist (e.g. two Michigan rows),
# so overlapping polygons hide all but one artist's count on the map.
# na.rm = TRUE: artists from the lookup with no match in artist_count carry an
# NA track_count after the right join.
state_hits <- artist_location %>%
  group_by(state) %>%
  summarise(track_count = sum(track_count, na.rm = TRUE))
music_states <- merge(sf_states, state_hits, by = "state")
head(music_states)
# Choropleth of track_count by state, drawn with the "cobalt" style.
# Changes from the original chain:
#   * tm_borders() removed: tm_polygons() already draws fill and borders, and
#     tmap warned that the duplicated layer type was omitted.
#   * tm_lines removed: it was added without parentheses (a function object,
#     not a layer) and a line layer does not apply to polygon geometry.
#   * tmap_style("cobalt") replaced by tm_style("cobalt"): tmap_style() sets a
#     global option and is not a plot element; tm_style() styles only this map.
# NOTE(review): tm_view() options apply to interactive view mode. Confirm
# whether the projection should instead be set on tm_shape() for static plots.
tm_shape(music_states) +
  tm_polygons("track_count") +
  tm_view(projection = 2163) +
  tm_style("cobalt")
## tmap style set to "cobalt"
## other available styles are: "white", "gray", "natural", "col_blind", "albatross", "beaver", "bw", "classic", "watercolor"
## Warning in eval(substitute(expr), e): Scaling levels may be incorrect for this
## projection. Please specify a leaflet projection with leafletCRS for more control
## Warning: One tm layer group has duplicated layer types, which are omitted. To
## draw multiple layers of the same type, use multiple layer groups (i.e. specify
## tm_shape prior to each of them).
Overall, this dataset was very interesting to explore, and it is large enough that you could easily examine many different relationships and correlations among the attributes of the songs, as well as how the songs rank by artist. I didn’t have any specific results I was expecting; rather, I wanted to see which relationships stood out, such as the fact that all of Katy Perry’s top hits rate highly for danceability and energy compared to those of other top female artists. It was also interesting to see that the most successful artists were born in California, whether that is due to location and access to the music industry, or has more to do with the style of music in the area and whether it is more likely to become popular. I struggled most with the spatial visualizations, as the data did not include any location information, so I collected my own data on artists’ birthplaces and created an additional spreadsheet to be joined.